Introduction

The Species Distribution Model (SDM) Benchmark Study Part 2 presents data from eight bird species across Colorado, Oregon, North Carolina, and Vermont, using environmental rasters from Part 1. The analysis segments data distribution by state and species and evaluates explanatory rasters using methods like Principal Component Analysis and Factor Analysis for Mixed Data. Additionally, the study addresses pseudo-absence generation and conducts feature engineering/selection to prepare for subsequent modeling.

Visualize all Observations by State & Species

The initial phase of the exploratory data analysis visualizes bird species observations across four U.S. states: Oregon, Vermont, Colorado, and North Carolina, covering the period from 2016 to 2019. These visual representations display the distribution of eight distinct bird species within each state while also highlighting major cities for geographical context. Such visualizations offer insights into spatial distribution patterns, potentially revealing habitat preferences, migration routes, or the influence of urban areas on these distributions. As the analysis progresses, understanding these distributions becomes pivotal, especially when addressing spatial autocorrelations and other features that influence the accuracy and validity of the Species Distribution Model (SDM) benchmarks.

data(us.cities)

# Suburbs / overlapping labels to drop per state so map labels stay readable.
# VT keeps all of its top cities, so its exclusion list is empty.
.excluded.cities <- list(
  OR = c("Gresham", "Hillsboro", "Corvallis", "Beaverton", "Springfield"),
  CO = c("Thornton", "Lakewood", "Aurora"),
  NC = c("Greensboro", "Durham", "Fayetteville"),
  VT = character(0)
)

# Get major cities for each sample region (state): take the six most
# populous, then drop the excluded ones. A lookup table replaces the
# previous duplicated if/else chain; results are identical.
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
  us.cities %>%
    filter(country.etc == s) %>%
    # `name` is "<city> <state abbr>"; strip the trailing state suffix
    mutate(city = gsub(paste0(" ", s), "", name)) %>%
    arrange(-pop) %>%
    head() %>%
    filter(!(city %in% .excluded.cities[[s]]))
})

# Basemap polygons for the four sample states
states <- map_data("state") %>%
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))

# Observation data: one RDS file per batch under data/final
data.files <- list.files("data/final", full.names = TRUE)

df <- purrr::map_df(data.files, readRDS)

# Capitalize any lowercase letter that immediately follows a whitespace
# character (used to title-case plot headings). Vectorized over `string`;
# the first character is left untouched (no preceding whitespace).
caps.after.ws <- function(string) {
  # \U upper-cases the captured group; requires perl-compatible regex
  gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = TRUE)
}

# Build a single map of one species' observations within one state:
# state polygon + observation points + labeled major cities.
# Relies on the globals `states`, `df`, and `top.cities` defined above.
#
# Args:
#   spec    - species common name (e.g. "Ruddy Duck")
#   st.abbr - two-letter state abbreviation ("CO", "NC", "VT", "OR")
# Returns: one-row data.table (state, species, plot list-column).
plot.for.species <- function(spec, st.abbr) {
  # Map the abbreviation to the lowercase region name used by map_data()
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  TRUE ~ "")
  
  # e.g. "CO Ruddy Duck Observations, 2016-2019"
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec), 
                             "Observations, 2016-2019"))
  
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec), 
               aes(x = lon, y = lat), 
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr), 
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) + 
    # City labels; justification tuned per state so labels stay readable
    geom_text(data = top.cities %>% filter(country.etc == st.abbr), 
              aes(x=long, y=lat, label=city),
              color="white", hjust=case_when(st.abbr=="NC"~.2,
                                               st.abbr=="VT"~.65,
                                               TRUE~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) + 
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())

  # One row per plot so results can be row-bound by map2_df
  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}

# All species/state combinations; keep strings rather than factors, for
# consistency with the other expand.grid() calls later in this analysis
spec.state <- expand.grid(unique(df$common.name), unique(df$state),
                          stringsAsFactors = FALSE) %>%
  rename(spec = Var1, st.abbr = Var2)

# Create a list of plots: one data.table row per state/species pair,
# with the ggplot object stored in the `plot` list column
plots <- purrr::map2_df(spec.state$spec,
                        spec.state$st.abbr,
                        ~plot.for.species(.x, .y))
# Arrange the four state maps (2x2) for each species. The explicit print()
# is required for the composites to render from inside a loop; this
# replaces eight duplicated do.call() blocks with identical output.
for (sp in c("Ruddy Duck", "Belted Kingfisher", "Wild Turkey",
             "Sharp-shinned Hawk", "Downy Woodpecker", "Cedar Waxwing",
             "Sandhill Crane", "Sanderling")) {
  print(do.call(ggpubr::ggarrange,
                c(plots[species == sp]$plot,
                  list(nrow = 2, ncol = 2))))
}

Explore Explanatory Rasters

# Explanatory raster stacks: one multi-layer GeoTIFF per state
states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast) %>% setNames(states)

TODO: explore additional raster summaries — terra::freq, terra::density, and terra::layerCor.

Principal Component Analysis

In this section, the study employs Principal Component Analysis (PCA) to transform the high-dimensional data into a lower-dimensional form while retaining as much of the original variance as possible. The initial step involves merging data from different states into a single dataframe, followed by cleaning factor levels of the NLCD_Land column. Subsequently, the code converts factor columns into dummy variables and removes columns with only a single unique value. The PCA() function is used to perform the PCA, and the results are visualized using bar charts, displaying the importance of each variable for the first two dimensions. This visualization is important in understanding the significance of different environmental rasters and how they contribute to the variations captured by the principal components.

# Stack each state's raster cell values into one long data.table, tagging
# rows with their state and dropping any row containing an NA
r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  # Layer names carry a state suffix (e.g. "dem_CO"); strip it
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  setDT(df)
  df[, state := factor(s, levels = states)]
  # complete.cases() is the vectorized equivalent of the previous row-wise
  # apply(df, 1, function(.x) !any(is.na(.x))) and avoids the matrix coercion
  df[complete.cases(df)]
})

# Normalize NLCD land-cover level names into snake_case identifiers,
# e.g. "Barren Land (Rock/Sand/Clay)" -> "barren_land_rock_sand_clay".
clean.levels <- function(x) {
  # Remove non-alphanumeric characters and replace with underscores
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Remove any leading or trailing underscores
  x <- gsub("^_|_$", "", x)
  # Collapse runs of underscores; "_+" handles runs of three or more,
  # which the previous single-pass "__" replacement left partially intact
  x <- gsub("_+", "_", x)
  # Strip the raster-name prefix. Matched in lowercase because x has
  # already been lower-cased -- the previous mixed-case pattern
  # "NLCD_Land_" could never match at this point and was dead code.
  x <- gsub("^nlcd_land_", "", x)
  return(x)
}

# Re-level NLCD_Land using the cleaned, snake_case level names
r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]

# Convert factor columns to dummy variables (model.matrix with -1 keeps
# one indicator column per level), then re-attach the numeric columns
df.dummies <- data.table(model.matrix(~ . - 1,
                                      data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with = FALSE])

names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))

# Ensure that there is more than one value per column (remove otherwise);
# single-valued columns carry no variance and would break the PCA
uniq.1 <- names(which(df.dummies[, sapply(.SD, uniqueN)] == 1))

if (length(uniq.1) >= 1) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with = FALSE]
}


# Fit a PCA on the (all-numeric) dummy-encoded raster data
pca.fit <- PCA(df.dummies, graph = FALSE)
plot.PCA(pca.fit, choix = "var")

# Variable coordinates (loadings) per principal dimension
res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var = as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL

# Horizontal bar chart of variable importance for one dimension; shared
# helper replaces two near-identical copies of the plotting code
importance.plot <- function(res, dim.name, fill) {
  ggplot(res, aes(x = reorder(var, .data[[dim.name]]), y = .data[[dim.name]])) +
    geom_bar(stat = "identity", fill = fill) +
    coord_flip() +  # Makes it a horizontal bar chart
    labs(title = paste("Variable importance for", dim.name),
         y = "Importance", x = "Variable") +
    theme_minimal()
}

p.d1 <- importance.plot(res, "Dim.1", "darkblue")
p.d2 <- importance.plot(res, "Dim.2", "darkred")

ggpubr::ggarrange(plotlist = list(p.d1, p.d2), nrow = 2, ncol = 1)

Factor Analysis for Mixed Data

Diving deeper into the structure of the data, Factor Analysis for Mixed Data (FAMD) is implemented next. FAMD is a unique technique that extends traditional Factor Analysis by handling both continuous and categorical data, which fits the nature of the dataset used in this study. The code initiates this process by applying the FAMD() function to the dataframe. The results are then visualized in three distinct plots that highlight the variance explained by different variables for both continuous and categorical data. These plots provide insights into the underlying structure of the data and the significance of each variable. By observing the variable importance for the first two dimensions, displayed through bar charts, one can discern which variables are related. Later, multi-collinearity and autocorrelation will be addressed in greater depth.

# Fit FAMD on the mixed continuous/categorical raster data
famd.fit <- FAMD(r.df, graph = FALSE)

# FAMD variable maps: all variables, quantitative-only, qualitative-only
ggpubr::ggarrange(plotlist = purrr::map(
  c("var", "quanti", "quali"),
  ~plot.FAMD(famd.fit, choix = .x)),
  ncol = 1, nrow = 3)

# Variable coordinates per FAMD dimension
res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var = as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL

# Build the two importance bar charts (Dim.1 / Dim.2) in a single pass
dim.plots <- purrr::map2(
  c("Dim.1", "Dim.2"), c("darkblue", "darkred"),
  function(d, col) {
    ggplot(res, aes(x = reorder(var, .data[[d]]), y = .data[[d]])) +
      geom_bar(stat = "identity", fill = col) +
      coord_flip() +  # Horizontal bars
      labs(title = paste("Variable importance for", d),
           y = "Importance", x = "Variable") +
      theme_minimal()
  })
p.d1 <- dim.plots[[1]]
p.d2 <- dim.plots[[2]]

ggpubr::ggarrange(plotlist = list(p.d1, p.d2), nrow = 2, ncol = 1)

Pseudo-Absence Generation

In species distribution modeling, the presence of a species in certain locations is often well-recorded, but the absence is typically under-reported or not reported at all. This creates a challenge when trying to understand the complete distribution of a species. To address this, the concept of pseudo-absence data is introduced. Pseudo-absence data are artificially generated points that represent locations where the species is assumed not to be present. In this section, pseudo-absence data points are generated for the eight bird species across the four states ensuring that these points do not overlap with observed presence data. The generated pseudo-absence data helps in providing a more comprehensive view of the species distribution and serves as an essential component for the upcoming modeling process.

Buffering Raster Data

For the method used for generating pseudo-absence data in this analysis, initially a buffer is created around each observation point, each with a 5000 meter radius. This buffer essentially “blocks” the regions of the sample-area from where pseudo-absence points can be sampled. I.e., only the non-buffered zones can be sampled from.

# Output directory for the 5 km buffer masks
output.dir <- "artifacts/masks_5k"
for (d in c("artifacts", output.dir)) {
  if (!dir.exists(d)) dir.create(d)
}

# LOAD DATA 

# Raster stacks, one per state
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states

# Observation points: state, species, and the sf point geometry
obs.df <- list.files("data/final", full.names = TRUE) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point = geometry)

# BUFFERING RASTER DATA 

# Buffer radius (meters)
dist <- 5e3

# Rasterize a buffer of radius `dist` (in unit `u`) around observation i
# onto mask.raster; cells covered by the buffer are set to 1.
#
# Args:
#   i           - row index into obs.df
#   mask.raster - terra raster being accumulated into
#   obs.df      - sf data frame of observation points
#   obs.field   - name of the geometry column holding the points
#   dist, u     - buffer radius and its unit (default 10000 m)
# Returns: the updated mask raster.
mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
                        dist=10000, u="m") {
  # Use obs.field rather than a hard-coded column name: the parameter was
  # previously ignored, silently breaking any non-default obs.field
  obs.pt <- st_transform(obs.df[i, obs.field], st_crs(mask.raster))
  # Create a buffer around the point, ensuring correct units
  buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
  return(terra::rasterize(buf, mask.raster, update=TRUE, field=1))
}

# For each observation point, create a buffered region and accumulate all
# buffers into a single mask raster matching r's extent and resolution.
# Cells inside any buffer end up non-NA; everything else stays NA.
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
                               dist=10000, u="m") {
  # Create an empty raster with the same extent and resolution as r
  mask.raster <- terra::rast(r)
  # Iteratively fold each observation's buffer into mask.raster;
  # seq_len() is safe for the zero-row case (1:nrow would give c(1, 0))
  for (i in seq_len(nrow(obs.df))) {
    mask.raster <- mask.update(i, mask.raster, obs.df,
                               obs.field=obs.field,
                               dist=dist, u=u)
    gc()  # rasterize can hold large temporaries; release them each pass
  }
  return(mask.raster)
}

# Compute (or load from cache) buffer masks for every state x species pair.
# Result shape: masks[[state]][[species]] -> terra raster.
masks <- purrr::map(states, function(state) {
    specs <- sort(unique(obs.df$common.name))
    spec.masks <- purrr::map(specs, function(spec, st=state) {
      fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
      if (file.exists(fname)) {
        cat("Reading", spec, st, "mask from", fname, "\n")
        r.mask <- rast(fname)
      } else {
        cat("Computing", spec, st, "mask, and saving to", fname, "\n")
        r.mask <- get.buffered.zones(r=r.list[[st]], 
                                     obs.df=filter(obs.df, state == st & 
                                                     common.name == spec),
                                     dist=dist)
        terra::writeRaster(r.mask, fname, overwrite=TRUE)
      }
      gc()
      r.mask
    }, .progress=TRUE)
    names(spec.masks) <- specs
    spec.masks
  })
names(masks) <- states

Sampling Pseudo-Absences and Comparing Totals

This section identifies areas outside the buffers as potential zones for pseudo-absences. To ensure an accurate representation, a random sampling mechanism is implemented. For each bird species in a state, a number of pseudo-absence points equal to the observed count (or a pre-defined minimum threshold, whichever is larger) is generated from the observed data. The result is a set of coordinates representing regions where the bird species have not been observed.

# Set seed
set.seed(19)

# Function to sample n points from the non-masked parts
#
# Builds (or reads from cache) a raster of candidate pseudo-absence cells --
# cells inside the state that fall OUTSIDE every observation buffer -- then
# randomly samples n of them as pseudo-absence points.
#
# Args:
#   r.original - the state's explanatory raster stack (defines valid extent)
#   r.mask     - buffer mask raster (non-zero where observations were buffered)
#   n          - requested number of pseudo-absence points
#   species    - species common name attached to the sampled points
#   state      - state abbreviation attached to the sampled points
#   sample.crs - CRS for the returned points (default EPSG:4326)
#   min.n      - floor on the sample size (n is raised to this if smaller)
#   output.dir - cache directory for the candidate-region rasters
#
# Returns: an sf data frame of sampled points with common.name, state,
#          lon, lat, and observations = 0 (i.e., absence records).
sample.inverse.mask <- function(r.original, r.mask, n, 
                                species, state,
                                sample.crs=4326, min.n=500,
                                output.dir="artifacts/pseudo_absence_regions") {
  if (!dir.exists(output.dir)) dir.create(output.dir)
  # Cache path "<STATE>_<species>.tif" with spaces/hyphens -> underscores
  output.path <- file.path(output.dir,
                           gsub(" |\\-", "_", 
                                paste0(
                                  paste(state, tolower(species), sep="_"), 
                                  ".tif")
                           ))
  if (!file.exists(output.path)) {
    # Get inverse mask;
    # Set NA cells to 0, keep 0 cells as 0, change other cells to 1
    r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
    # Set 0 to 1 and everything else to NA
    r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
    # Crop so that anything outside of the state is NA
    r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
    
    # Create a binary raster from r.original where valid values are 
    # set to 1 and NA values remain NA
    r.binary <- terra::lapp(r.original[[1]], 
                            fun = function(x) ifelse(!is.na(x), 1, NA))
    
    # Multiply the cropped raster by the binary raster to ensure 
    # outside values are set to NA
    r.final <- r.cropped * r.binary
    terra::writeRaster(r.final, output.path, overwrite=T)
  } else {
    # Cached candidate-region raster from a previous run
    r.final <- terra::rast(output.path)
  }
  
  # Convert the raster to SpatialPoints
  gdf <- terra::as.points(r.final) %>%
    st_as_sf() %>%
    st_transform(crs=sample.crs)
  if (nrow(gdf) > 0) {
    # NOTE(review): assumes the single value column is named "layer" after
    # as.points(); confirm cached rasters never carry a different layer name
    gdf <- gdf %>%
      filter(!is.na(layer)) %>%
      select(geometry)
  } else {
    # No candidate cells at all -- return the empty sf object unchanged
    return(gdf)
  }
  
  # Set to min.n size if n < min.n
  if (n < min.n) n <- min.n
  # Make sure there is sufficient available sample points
  if (n > nrow(gdf)) n <- nrow(gdf)
  
  # Randomly sample n points from the available (non-masked) space
  sample.idx <- sample(1:nrow(gdf), n)
  samples <- gdf[sample.idx,] %>%
    mutate(common.name = species, 
           state = state, 
           lon = NA, 
           lat = NA,
           observations=0)
  
  # Populate lon and lat values:
  coords <- st_coordinates(samples)
  samples$lon <- coords[, "X"]
  samples$lat <- coords[, "Y"]
  
  return(samples)
}

# Observed counts (N) by state and species; result stays grouped so it can
# be joined against the pseudo-absence totals computed later
totals <- obs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(N = n(), .groups = "keep")


# Ensure the pseudo-absence output directory exists
pa.dir <- file.path("data", "final_pseudo_absence")
if (!dir.exists(pa.dir)) {
  dir.create(pa.dir)
}

# Generate and persist pseudo-absence points per state, unless every
# per-state output file already exists (all-or-nothing cache check)
if (!all(
  file.exists(
    paste0(file.path("data", "final_pseudo_absence", 
                     paste0("pa_", states, ".rds")))
  )
)) {
  # Create a list of pseudo absence points, by species and state,
  # where the sample number `n` >= 500 | `n` == the total observed
  # for each respective state and species
  pseudo.absence.pts <- list()
  for (st in states) {
    r.original <- r.list[[st]]
    r.masks <- masks[[st]]
    pseudo.absence.pts[[st]] <- list()
    for (spec in names(r.masks)) {
      r.mask <- r.masks[[spec]]
      # Observed count for this state/species drives the sample size
      n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
      cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
      # Note: spec/st fill the species/state parameters positionally
      # (n is passed by name), matching sample.inverse.mask's signature
      pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
        r.original, r.mask, spec, st, n=n, sample.crs=4326)
      cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
    }
  }
  
  # Extract raster values for each point
  for (state in states) {
    out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
    if (!file.exists(out.file.all)) {
      r <- r.list[[state]]
      
      cat(sprintf("Extracting points to values for %s...\n", state))
      # Load observations shapefile
      geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
      rownames(geo.df) <- NULL
      
      # Remember the original CRS so it can be restored after extraction
      geo.df.crs <- st_crs(geo.df)
      
      # Define target CRS and update
      target.crs <- st_crs(r)
      cat(sprintf("Updating CRS for %s...\n", state))
      geo.df <- st_transform(geo.df, target.crs)
      
      # Extract raster values (layer names get the state suffix stripped)
      for (r.name in names(r)) {
        cat("\tExtracting", r.name, "values for", state, "\n")
        x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
        geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
      }
      
      # Update crs back
      geo.df <- st_transform(geo.df, geo.df.crs)
      
      # Fix names; Filter NA values (drop rows with any NA extraction)
      geo.df <- geo.df %>%
        filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
        suppressWarnings() 
      
      saveRDS(geo.df, out.file.all)
      cat("--------------\n")
    }
  }
}

The table below compares the generated pseudo-absence data with the observed dataset. This comparison ensures that the number of pseudo-absence points matches the observed ones, balancing the dataset and making it ready for the next analysis steps.

# Load every generated pseudo-absence file; keep only comparison columns
abs.df <- file.path("data", "final_pseudo_absence") %>%
  list.files(full.names = TRUE) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point = geometry)

# Compare pseudo-absence counts against observed counts per state/species;
# small shortfalls occur where extracted raster values were NA
abs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(psuedo.absence.N = n(), .groups = "keep") %>%
  left_join(totals, by = c("state", "common.name")) %>%
  rename(observed.N = N) %>%
  knitr::kable()
state common.name psuedo.absence.N observed.N
CO Belted Kingfisher 4523 4551
CO Cedar Waxwing 3431 3446
CO Downy Woodpecker 7440 7511
CO Ruddy Duck 1715 1726
CO Sanderling 497 131
CO Sandhill Crane 1524 1532
CO Sharp-shinned Hawk 2241 2257
CO Wild Turkey 2597 2611
NC Belted Kingfisher 4042 4183
NC Cedar Waxwing 4059 4195
NC Downy Woodpecker 10415 10914
NC Ruddy Duck 1076 1106
NC Sanderling 488 311
NC Sandhill Crane 489 118
NC Sharp-shinned Hawk 1211 1254
NC Wild Turkey 2261 2372
OR Belted Kingfisher 5803 5837
OR Cedar Waxwing 8405 8446
OR Downy Woodpecker 8529 8576
OR Ruddy Duck 1996 2010
OR Sanderling 496 258
OR Sandhill Crane 2443 2458
OR Sharp-shinned Hawk 2696 2714
OR Wild Turkey 2429 2440
VT Belted Kingfisher 1956 2033
VT Cedar Waxwing 1098 3938
VT Downy Woodpecker 1598 4635
VT Ruddy Duck 490 51
VT Sanderling 493 39
VT Sandhill Crane 492 76
VT Sharp-shinned Hawk 730 748
VT Wild Turkey 2090 2181

Identifying Spatial Autocorrelation

Moran’s I

This is a common measure of global spatial autocorrelation. A positive Moran’s I suggests clustering, a negative value suggests dispersion, and a value near zero suggests randomness. In other words, this is a randomization approach to testing for spatial autocorrelation. It is essentially checking whether the data has a spatial pattern that is significantly different from what would be expected if the values were randomly distributed in space.

Understanding the Results

Moran I statistic: This is the calculated Moran’s I value for the data, which quantifies the degree of spatial autocorrelation. A positive value indicates positive autocorrelation (similar values are closer together), while a negative value indicates negative autocorrelation (dissimilar values are closer together). A value close to zero indicates a random spatial pattern.

Moran I statistic standard deviate: This value represents the standardized Moran’s I value. The larger the absolute value of this statistic, the stronger the evidence against the null hypothesis (of no spatial autocorrelation). A positive value indicates positive spatial autocorrelation, suggesting clustering of similar values.

P-value: This is the probability of observing a Moran’s I value as extreme as, or more extreme than, the one computed from the data, assuming the null hypothesis of no spatial autocorrelation is true. A very small p-value provides strong evidence to reject the null hypothesis, indicating significant spatial autocorrelation in your data.

Expectation: This is the expected Moran’s I value under the null hypothesis of no spatial autocorrelation. For a large dataset, it’s typically close to zero.

Variance: This is the variance of the Moran’s I statistic under the null hypothesis.

Spatial Autocorrelation of Observations

# Directory for cached Moran's I results on the observation data
morans.dir <- "artifacts/obs_autocor_morans"
if (!dir.exists(morans.dir)) {
  dir.create(morans.dir)
}

# Observation data; only these columns are needed for the
# autocorrelation tests below
df <- list.files(file.path("data", "final"), full.names = TRUE) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observations, geometry)

states <- sort(unique(df$state))
species <- sort(unique(df$common.name))

# Flatten a moran.test() result into a one-row tibble keyed by state and
# species. Uses tibble() -- data_frame() is deprecated in the tibble
# package and emits warnings (it is an exact alias, so output is unchanged).
extract.data <- function(st, spec, results) {
  tibble(
    state = st,
    species = spec,
    statistic = results$statistic,
    p.value = results$p.value,
    moran.i.statistic = results$estimate['Moran I statistic'],
    expectation = results$estimate['Expectation'],
    variance = results$estimate['Variance']
  )
}

# Define the k for knn computation
k <- 5

# Run (or load cached) Moran's I tests for every state x species pair.
# For each pair: build a k-nearest-neighbour graph on the observation
# points, convert it to a spatial weights list, and test the observation
# counts for spatial autocorrelation.
if (!file.exists("artifacts/obs_autocor_morans/mi.results.rds")) {
  # Perform test by state, by species
  mi.results <- list()
  
  for (st in states) {
    mi.results[[st]] <- list()
    for (spec in species) {
      cat("Doing Moran's test for", spec, "in", st, "\n")
      # Filter data
      d <- df %>% filter(common.name == spec & state == st)
      mi.results[[st]][[spec]]$data <- d
      # Fit knn model
      knn <- d %>%
        knearneigh(k = k)
      mi.results[[st]][[spec]]$knn <- knn
      # Create a neighbor's list
      nb <- knn %>%
        knn2nb()
      mi.results[[st]][[spec]]$nb <- nb
      # Create a spatial weights matrix
      listw <- nb2listw(nb)
      mi.results[[st]][[spec]]$listw <- listw
      # Compute Moran's I
      results <- moran.test(d$observations, listw)
      mi.results[[st]][[spec]]$moran.test.results <- extract.data(st, spec, results)
    }
  }
  saveRDS(mi.results, "artifacts/obs_autocor_morans/mi.results.rds")
} else {
  # Cached results from a previous run
  mi.results <- readRDS("artifacts/obs_autocor_morans/mi.results.rds")
}

# All species/state pairs as strings (needed for list indexing below)
spec.stat <- expand.grid(species = species, state = states,
                         stringsAsFactors = FALSE)

# Collect the per-pair Moran's I summaries into one tibble; seq_len() is
# the safe form of 1:nrow() for the (theoretical) zero-row case
mi.results.df <- purrr::map(seq_len(nrow(spec.stat)), function(i) {
    spec <- spec.stat[i, ]$species
    st <- spec.stat[i, ]$state
    mi.results[[st]][[spec]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

The Moran’s I statistic for the Belted Kingfisher and Cedar Waxwing are positive in Oregon, as well as the Sharp-shinned Hawk in both Vermont and Oregon. This suggests that locations with high observations of these species are close to other locations with high observations, and the same for low observations. In simple terms, the observation patterns of these species are clustered.

Below is an example of the actual observations of the Sharp-shinned Hawk in Vermont against the spatial lag (i.e., the weighted sum of the neighboring values at each point). Consider the following when interpreting the plot:

  • First Quadrant (top-right): High values surrounded by high values (indicating clustering of high values).
  • Second Quadrant (top-left): Low values surrounded by high values.
  • Third Quadrant (bottom-left): Low values surrounded by low values (indicating clustering of low values).
  • Fourth Quadrant (bottom-right): High values surrounded by low values.
# Data for the Sharp-shinned Hawk in Vermont
vt.ssh <- mi.results[["VT"]][["Sharp-shinned Hawk"]]$data

# Spatial lag: the weighted sum of neighboring observation values at each point
vt.ssh$spatial.lag <- lag.listw(
  mi.results[["VT"]][["Sharp-shinned Hawk"]]$listw,
  vt.ssh$observations)

# Moran scatter plot: observations vs. their spatial lag, both on log10 axes
ggplot(vt.ssh, aes(x = observations, y = spatial.lag)) +
  geom_point() +
  geom_smooth(method = "lm", col = "red") +  # Linear trend line
  ggtitle("Moran Scatter Plot for Sharp-shinned Hawk in VT") +
  xlab("Observations (log10 Scale)") +
  ylab("Spatial Lag of Observations (log10 Scale)") +
  scale_x_log10() +
  scale_y_log10()

Spatial Autocorrelation of Environmental Factors

# Convert each state's raster stack into an sf point data frame (one column
# per layer), with the NLCD land-cover factor expanded into binary columns
env.df.list <- list()
for (state in states) {
  r <- r.list[[state]]
  
  # Raster cells -> points in WGS84
  gdf <- terra::as.points(r) %>% 
    st_as_sf() %>%
    st_transform(crs=4326)
  # Fix names (strip the trailing state suffix, e.g. "dem_CO" -> "dem")
  names(gdf) <- gsub(paste0("_", state, "$"), "", names(gdf))
  
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=gdf, sep=".") %>% 
    predict(., gdf) %>%
    as.data.frame()
  # Normalize the indicator column names to snake_case
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  # Replace the factor column with its indicator columns
  gdf <- gdf %>% select(-NLCD_Land) %>% cbind(binary.lc)
  
  env.df.list[[state]] <- gdf
}

# Perform test by state, by environmental variable
env.mi.results <- list()

output.dir <- file.path("artifacts", "env_autocor_morans")
if (!dir.exists(output.dir)) dir.create(output.dir)

# Moran's I per state x environmental variable, cached as one RDS per pair.
# NOTE(review): when a variable is constant (n.distinct <= 1) the test is
# skipped but saveRDS still writes a NULL result -- this acts as a cache
# marker so the skip isn't recomputed on re-runs; confirm downstream code
# tolerates the resulting NULL entries.
for (st in states) {
  env.mi.results[[st]] <- list()
  gdf <- env.df.list[[st]]
  env.vars <- names(gdf)[names(gdf) != "geometry"]
  for (e in env.vars) {
    output.path <- file.path(output.dir, paste0(st, "_", e, ".rds"))
    if (!file.exists(output.path)) {
      cat("Doing Moran's test for", e, "in", st, "\n")
      # Filter data
      d <- gdf %>% select(!!sym(e))
      # env.mi.results[[st]][[e]]$data <- d
      n.distinct <- d %>% pull(!!sym(e)) %>% unique() %>% length()
      if (n.distinct > 1) {
        # Fit knn model
        knn <- d %>%
          knearneigh(k = k)
        # env.mi.results[[st]][[e]]$knn <- knn
        # Create a neighbor's list
        nb <- knn %>%
          knn2nb()
        # env.mi.results[[st]][[e]]$nb <- nb
        # Create a spatial weights matrix
        listw <- nb2listw(nb)
        # env.mi.results[[st]][[e]]$listw <- listw
        # Compute Moran's I
        results <- moran.test(d[[e]], listw)
        env.mi.results[[st]][[e]]$moran.test.results <- extract.data(st, e, results)
      }
      cat("\tSaving results...\n")
      saveRDS(env.mi.results[[st]][[e]]$moran.test.results, output.path)
    } else {
      cat("Getting Moran's test results for", e, "in", st, "\n")
      env.mi.results[[st]][[e]]$moran.test.results <- readRDS(output.path)
    }
  }
}


# All environment-variable/state pairs (geometry column excluded)
env.stat <- expand.grid(env = names(env.df.list$CO)[names(env.df.list$CO) != "geometry"],
                        state = states, stringsAsFactors = FALSE)

# Collect per-pair Moran's I summaries; seq_len() replaces the 1:nrow idiom
env.mi.results.df <- purrr::map(seq_len(nrow(env.stat)), function(i) {
    e <- env.stat[i, ]$env
    st <- env.stat[i, ]$state
    env.mi.results[[st]][[e]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
env.mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

The weather layers in particular have Moran’s I values close to 1, which is the maximum possible value. This indicates a very strong spatial clustering of these variables.

Mitigating Potential Problems Due to Spatial Autocorrelation

  1. Collinearity
  2. Spatial Autocorrelation in Residuals
  3. Overfitting
  4. Non-Stationarity
Checking for Collinearity in Environmental Factors
# Align columns across states (some land-cover classes are absent in some
# states), fill missing columns with NA, then stack into one data frame
all.columns <- unique(unlist(lapply(env.df.list, names)))
env.df <- lapply(env.df.list, function(df) {
  missing.cols <- setdiff(all.columns, names(df))
  for (col in missing.cols) {
    df[[col]] <- NA
  }
  return(df)
}) %>%
  do.call("rbind", .) %>%
  as.data.frame() %>%
  select(-geometry, -unclassified)

# Condition indices from the eigenvalues of the correlation matrix;
# values above ~30 conventionally indicate serious multicollinearity
corr.matrix <- cor(env.df, use="na.or.complete")
eigenvalues <- eigen(corr.matrix)$values
# NOTE(review): each condition index belongs to an eigenvalue/component,
# not to an individual variable -- the `variable` column here pairs them
# positionally only; verify before interpreting row-by-row
ci.df <- tibble(
  variable=names(env.df),
  condition.index=sqrt(max(eigenvalues) / eigenvalues)
)

ci.df
# Extract the eigenvectors
eigenvectors <- eigen(corr.matrix)$vectors

# Condition indices above this threshold flag serious multicollinearity
threshold <- 30
large.ci.indices <- which(sqrt(max(eigenvalues) / eigenvalues) > threshold)

# Examine the eigenvectors tied to the problematic condition indices
for (index in large.ci.indices) {
  cat(paste("Eigenvalue:", eigenvalues[index]), "\n")
  cat(paste("Condition Index:", 
                  sqrt(max(eigenvalues) / eigenvalues[index])), "\n")
  
  # Sorting eigenvector components by absolute magnitude for clarity
  ev <- eigenvectors[, index]
  sorted.ev <- sort(abs(ev), decreasing = TRUE)
  # Guard against matrices with fewer than 5 variables
  n.top <- min(5, length(sorted.ev))
  ev.top <- sorted.ev[seq_len(n.top)] %>%
    as_tibble() %>%
    mutate(variable=rownames(corr.matrix)[order(-abs(ev))][seq_len(n.top)]) %>%
    select(variable, value)
  
  cat("\nTop", n.top, "contributors to multicollinearity at the condition idx:\n")
  
  for (row in seq_len(n.top)) {
    cat("\t", ev.top[row,]$variable, ":", signif(ev.top[row,]$value, 3), "\n")
  }
  
  cat("\n------------------------\n")
}
## Eigenvalue: 0.00135666182336175 
## Condition Index: 65.3042479620253 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   avg_prcp : 0.814 
##   tmax : 0.459 
##   tmin : 0.356 
##   dem : 0.0108 
##   Winter_NDVI : 0.00657 
## 
## ------------------------
## Eigenvalue: 1.97064586234132e-15 
## Condition Index: 54184234.4382929 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   shrub_scrub : 0.508 
##   evergreen_forest : 0.5 
##   herbaceous : 0.473 
##   cultivated_crops : 0.318 
##   deciduous_forest : 0.207 
## 
## ------------------------
# Visualize the full correlation matrix; clusters of high |r| corroborate
# the condition-index findings above
corrplot::corrplot(corr.matrix)

Feature Engineering

Setup

First, create a space to output any “engineered” rasters:

# Directory that receives all engineered rasters (created once, reused below)
output.dir <- "artifacts/feature_engineering"
if (!dir.exists(output.dir)) dir.create(output.dir)

Use Hierarchical Categories for Land Cover

Recall the hierarchical categories for the land cover data:

  • Water:
    • 11: Open Water
    • 12: Perennial Ice/Snow
  • Developed
    • 21: Developed, Open Space
    • 22: Developed, Low Intensity
    • 23: Developed, Medium Intensity
    • 24: Developed, High Intensity
  • Barren
    • 31: Barren Land (Rock/Sand/Clay)
  • Forest
    • 41: Deciduous Forest
    • 42: Evergreen Forest
    • 43: Mixed Forest
  • Shrubland
    • 51: Dwarf Shrub
    • 52: Shrub/Scrub
  • Herbaceous
    • 71: Grassland/Herbaceous
    • 72: Sedge/Herbaceous
    • 73: Lichens
    • 74: Moss
  • Planted/Cultivated
    • 81: Pasture/Hay
    • 82: Cultivated Crops
  • Wetlands
    • 90: Woody Wetlands
    • 95: Emergent Herbaceous Wetlands

Using these categories, feature reduction can be accomplished using a heuristic methodology. Other redundant rasters, such as the Waterbody and Urban Imperviousness rasters, can also be combined with the respective related land cover rasters to further reduce multicollinearity between rasters.

create.parent.category.rasters <- function(input.raster, 
                                           state, 
                                           output.dir,
                                           layer.name="NLCD_Land") {
  # Collapse detailed NLCD land cover classes into binary (0/1) parent
  # category rasters, writing one GeoTIFF per category per state.
  #
  # Args:
  #   input.raster: multi-layer SpatRaster for one state; must contain the
  #     land cover layer plus "waterbody_<state>" and
  #     "urban_imperviousness_<state>" layers.
  #   state: two-letter state code used in layer and file names.
  #   output.dir: directory receiving "<state>_<category>.tif" files.
  #   layer.name: name of the categorical land cover layer.
  #
  # Side effects: writes rasters to disk; categories whose output file
  # already exists are skipped (simple file-based cache).
  
  # Define the category mappings (NLCD class labels -> parent category).
  # NOTE(review): assumes the land cover layer carries class *labels*
  # (factor/character values), not numeric codes -- confirm against Part 1.
  categories <- list(
    water = c("Open Water"),
    ice.snow = c("Perennial Ice/Snow"),
    developed = c("Developed, Open Space",
                  "Developed, Low Intensity",
                  "Developed, Medium Intensity",  
                  "Developed, High Intensity"),
    barren = c("Barren Land"),
    forest = c("Mixed Forest", 
               "Deciduous Forest", 
               "Evergreen Forest"),
    shrubland = c("Shrub/Scrub", "Dwarf Shrub"),
    herbaceous = c("Herbaceous", "Grassland/Herbaceous",
                   "Sedge/Herbaceous", "Lichens", "Moss"),
    planted.cultivated = c("Cultivated Crops", "Hay/Pasture"),
    wetlands = c("Woody Wetlands", "Emergent Herbaceous Wetlands")
  )
  
  # Iterate through each category to create and save the binary raster
  for (cat.name in names(categories)) {
    # Define the output file path
    output.file <- file.path(output.dir, paste0(state, "_", cat.name, ".tif"))
    
    if (!file.exists(output.file)) {
      cat("Generating raster for", cat.name, "in", state, "\n")
      if (cat.name != "developed") {
        # Create a binary raster for the current category
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x %in% categories[[cat.name]] ~ 1.,
                                      TRUE ~ 0)
                                  })
        names(out.raster) <- cat.name
        if (cat.name == "water") {
          # Blend with the waterbody raster layer (mean of the two layers)
          wb.raster <- input.raster[[paste0("waterbody_", state)]]
          out.raster <- (out.raster + wb.raster) / 2
          names(out.raster) <- "water"
        }
      } else {
        # Encode development intensity on an ordinal 0.25-1 scale (step 0.25)
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x == "Developed, Open Space" ~ 0.25,
                                      x == "Developed, Low Intensity" ~ 0.5,
                                      x == "Developed, Medium Intensity" ~ 0.75,
                                      x == "Developed, High Intensity" ~ 1.,
                                      TRUE ~ 0)
                                  })
        # Average with min-max scaled urban imperviousness
        ui.raster <- input.raster[[paste0("urban_imperviousness_", state)]]
        ui.min <- terra::minmax(ui.raster)[[1]]
        ui.max <- terra::minmax(ui.raster)[[2]]
        ui.raster.scaled <- (ui.raster - ui.min) / (ui.max - ui.min)
        out.raster <- (out.raster + ui.raster.scaled) / 2
        names(out.raster) <- "developed"
      }
      
      # Save the output raster to the specified output path
      writeRaster(out.raster, output.file, overwrite=TRUE)
    }
  }
}

# Generate the parent-category rasters for every state
purrr::walk(states, function(st) {
  create.parent.category.rasters(r.list[[st]], st, output.dir)
})

Aggregate Seasonal NDVI Rasters (Min/Max)

# Convert 4 separate seasonal NDVI rasters into min/max composite rasters
for (state in states) {
  output.file.min <- file.path(output.dir, paste0(state, "_min_NDVI.tif"))
  output.file.max <- file.path(output.dir, paste0(state, "_max_NDVI.tif"))
  # Regenerate if either composite is missing (scalar condition, so use ||)
  if (!file.exists(output.file.min) || !file.exists(output.file.max)) {
    r.ndvi <- r.list[[state]][[names(r.list[[state]]) %like% "NDVI"]]
    # Min-max scale each seasonal layer to [0, 1] so seasons are comparable
    for (s in names(r.ndvi)) {
      r <- r.ndvi[[s]]
      .min <- terra::minmax(r)[[1]]
      .max <- terra::minmax(r)[[2]]
      r.ndvi[[s]] <- (r - .min) / (.max - .min)
    }
    # Cell-wise min/max across the scaled seasonal layers
    r.max <- max(r.ndvi)
    names(r.max) <- "max.ndvi"
    r.min <- min(r.ndvi)
    names(r.min) <- "min.ndvi"
    writeRaster(r.max, output.file.max, overwrite=TRUE)
    writeRaster(r.min, output.file.min, overwrite=TRUE)
  }
}

Combine Temperature Rasters (Difference)

# Collapse tmin/tmax into a single temperature-range raster, scaled to [0, 1]
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_tdiff.tif"))
  if (!file.exists(output.file)) {
    # Cell-wise difference between max and min temperature
    r.tdiff <- r.list[[state]][[paste0("tmax_", state)]] - 
      r.list[[state]][[paste0("tmin_", state)]]
    # Get min/max of the differences for scaling
    .min <- terra::minmax(r.tdiff)[[1]]
    .max <- terra::minmax(r.tdiff)[[2]]
    # Scale to be from 0 to 1
    r.tdiff <- (r.tdiff - .min) / (.max - .min)
    names(r.tdiff) <- "temp.diff"
    # Write raster
    writeRaster(r.tdiff, output.file, overwrite=TRUE)
  }
}

Spatial Filters

These are components derived from the spatial coordinates, which can capture and account for spatial structures in the data. E.g., a polynomial term based on latitude and longitude could account for some of the spatial trend.

# Build spatial-filter rasters: lon, lat, their squares, and the lon x lat
# interaction, to absorb broad spatial trends in downstream models.
for (state in states) {
  output.file.lon <- file.path(output.dir, paste0(state, "_lon.tif"))
  output.file.lat <- file.path(output.dir, paste0(state, "_lat.tif"))
  output.file.lon2 <- file.path(output.dir, paste0(state, "_lon_poly.tif"))
  output.file.lat2 <- file.path(output.dir, paste0(state, "_lat_poly.tif"))
  output.file.lli <- file.path(output.dir, paste0(state, "_lon_lat_interaction.tif"))
  if (!all(file.exists(c(output.file.lon, output.file.lat, 
                         output.file.lon2, output.file.lat2,
                         output.file.lli)))) {
    # Get raster cells as a data frame of WGS84 coordinates, keyed by cell id
    r.df <- terra::as.data.frame(r.list[[state]], xy=TRUE, cells=TRUE) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y")
    
    # Initialize empty raster templates matching the state's grid
    r.lat <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    r.lon <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with lat/lon values by cell id
    r.lat[r.df$cell] <- r.df$lat
    names(r.lat) <- "lat"
    r.lon[r.df$cell] <- r.df$lon
    names(r.lon) <- "lon"
    # Get polynomial and interaction terms
    r.lat2 <- r.lat^2 
    names(r.lat2) <- "lat.sqrd"
    r.lon2 <- r.lon^2 
    names(r.lon2) <- "lon.sqrd"
    r.lon.lat <- r.lon * r.lat
    names(r.lon.lat) <- "lon.lat.interaction"
    
    # Write outputs
    writeRaster(r.lat, output.file.lat, overwrite=TRUE)
    writeRaster(r.lon, output.file.lon, overwrite=TRUE)
    writeRaster(r.lat2, output.file.lat2, overwrite=TRUE)
    writeRaster(r.lon2, output.file.lon2, overwrite=TRUE)
    writeRaster(r.lon.lat, output.file.lli, overwrite=TRUE)
  }
}

De-trend DEM

# De-trend elevation: regress DEM on a quadratic surface of lon/lat and keep
# the min-max scaled residuals, removing the broad spatial trend.
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_detrend_dem.tif"))
  if (!file.exists(output.file)) {
    # Get raster cells (with DEM values) as WGS84 coordinates
    r.df <- terra::as.data.frame(r.list[[state]], xy=TRUE, cells=TRUE) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell, !!sym(paste0("dem_", state))) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y", dem=paste0("dem_", state)) %>%
      # Convert to data.table
      setDT()
    
    # Fit quadratic trend surface based on location, with dem as response
    fit <- lm(dem ~ lat * lon + I(lat^2) + I(lon^2), 
              data=r.df[!is.na(dem)])
    # Get residuals from the model as "de-trended" dem values
    r.df[!is.na(dem), dem.detrended := residuals(fit)]
    # Min-max scale de-trended values to [0, 1]
    r.df[, dem.detrended := (dem.detrended - min(dem.detrended, na.rm=TRUE)) / 
           (max(dem.detrended, na.rm=TRUE) - min(dem.detrended, na.rm=TRUE))]
    # Initialize empty raster template
    r.dem <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with de-trended dem values
    r.dem[r.df$cell] <- r.df$dem.detrended
    names(r.dem) <- "dem.detrended"
    
    # Write outputs
    writeRaster(r.dem, output.file, overwrite=TRUE)
  }
}

Feature Selection

Load all Original Features and Feature Engineering Outputs

# The fully assembled modeling dataset is cached as a single RDS file
final.output.dir <- "artifacts/final_data"
if (!dir.exists(final.output.dir)) dir.create(final.output.dir)
final.fpath <- file.path(final.output.dir, "final_data.rds")

if (!file.exists(final.fpath)) {
  # Combine observation/pseudo-absence data
  df <- c(list.files(file.path("data", "final_pseudo_absence"), full.names = TRUE),
          list.files(file.path("data", "final"), full.names = TRUE)) %>%
    purrr::map_df(readRDS)
  
  # Get feature engineered raster data
  output.dir <- "artifacts/feature_engineered_final"
  if (!dir.exists(output.dir)) dir.create(output.dir)
  # Rebuild the per-state multi-layer rasters when fewer cached files exist
  # than states. (Fixed: the previous check compared *filenames* (character)
  # to length(states), a lexicographic comparison that was effectively
  # always FALSE once any file existed.)
  get.fe <- length(list.files(output.dir)) < length(states)
  fe.r.list <- list()
  if (get.fe) {
    for (state in states) {
      # Collect this state's single-layer engineered rasters into one stack
      fe.r.list[[state]] <- list.files(
        "artifacts/feature_engineering", full.names = TRUE)[
          grepl(
            paste0("^", state, "_"), 
            list.files("artifacts/feature_engineering", full.names = FALSE)
          )
        ] %>% rast()
      # Save as multi-layer rasters by state
      writeRaster(fe.r.list[[state]], 
                  file.path(output.dir, paste0(state, ".tif")),
                  overwrite=TRUE)
    }
  } else {
    for (state in states) {
      fe.r.list[[state]] <- rast(file.path(output.dir, paste0(state, ".tif")))
    }
  }
  
  # Add empty fields in dataframe for each new raster layer name
  purrr::map(fe.r.list, names) %>% 
    reduce(c) %>% 
    unique() %>% 
    sort() %>%
    purrr::walk(function(n) {
      if (!(n %in% names(df))) df[[n]] <<- 0
    })
  
  # Update point crs to match fe rasters
  df <- st_transform(df, st_crs(fe.r.list[[1]])) 
  df.list <- list()
  # From state multi-layer rasters, extract values from each point in df
  for (st in states) {
    cat(sprintf("Extracting points to values for %s...\n", st))
    r <- fe.r.list[[st]]
    df.list[[st]] <- df %>% filter(state == st)
    # Extract raster values layer by layer
    for (r.name in names(r)) {
      cat("\tExtracting", r.name, "values for", st, "\n")
      x <- terra::extract(r[[r.name]], df.list[[st]])[[r.name]]
      df.list[[st]] <- mutate(df.list[[st]], !!r.name := x)
    }
  }
  # Combine list
  df <- do.call("rbind", df.list)
  rm(df.list)
  gc()
  # Update crs back to WGS84 (EPSG:4326)
  df <- st_transform(df, 4326)
  # Remove rownames
  rownames(df) <- NULL
  
  # Convert land cover to binary (one-hot) variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=df, sep="") %>% 
    predict(., df) %>%
    as.data.frame()
  
  # Normalize dummy-variable names: strip prefix, underscore-separate
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  # Remove duplicates already covered by feature engineering
  binary.lc <- binary.lc %>%
    select(-c("unclassified", "perennial_snow_ice", "barren_land",
              "shrub_scrub", "herbaceous"))
  # Combine with dataframe, remove land cover categorical var
  df <- df %>% select(-NLCD_Land) %>% cbind(binary.lc)
  saveRDS(df, final.fpath)
} else {
  df <- readRDS(final.fpath)
}

# Convert the assembled dataset to a data.table (modified by reference)
setDT(df)

# Quick preview of the final feature table
as_tibble(df)

Correlation

# Per-species correlation of every predictor with the "observations" target.
# Output: one column per species, rows sorted by |correlation|, formatted as
# "variable: r" strings for side-by-side comparison across species.
obs.cor <- purrr::map(sort(unique(df$common.name)), function(spec) {
  # Correlation matrix over this species' rows (data.table filter),
  # dropping the non-numeric / non-predictive columns
  corr.matrix <- cor(select(df[common.name == spec], 
                            -c("state", "common.name", "geometry")))
  # Pull the row of correlations against the target variable
  obs.cor <- corr.matrix[which(rownames(corr.matrix) == "observations"),] %>%
    as.data.frame()
  obs.cor$variable <- rownames(obs.cor)
  obs.cor %>%
    filter(variable != "observations") %>%
    # The single data column produced by as.data.frame() is literally named "."
    rename(!!spec := `.`) %>%
    arrange(-abs(!!sym(spec))) %>%
    mutate(!!spec := paste0(variable, ": ", signif(!!sym(spec), 2))) %>%
    select(!!sym(spec)) 
}) %>% 
  do.call("cbind", .)

obs.cor %>% as_tibble()

Train/Test Split

# Set seed for splitting and modeling (reproducibility)
set.seed(19)

# Return training-row indices from a stratified split that preserves the
# joint distribution of spatial grid cell, species, state, and
# presence/absence status.
#
# Args:
#   df: data with lat, lon, observations, common.name, and state fields.
#   p: proportion of rows assigned to the training set.
#   lat.lon.bins: number of divisions per coordinate axis for the grid.
# Returns: matrix of training row indices (caret::createDataPartition).
stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Cut along lat/lon values to create grid cells (lat.bin & lon.bin)
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = FALSE)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = FALSE)
  # Pseudo-absences are rows with zero observations
  df$absence <- df$observations == 0
  
  # Create a new variable combining the stratification variables
  df %>%
    # stratify on lat/lon bins, species, state, and absence
    mutate(strata = paste(lat.bin, lon.bin, common.name, state, absence)) %>%
    pull(strata) %>%
    # Create the data partitions; warnings about sparse strata are expected
    createDataPartition(., p = p, list = FALSE) %>%
    suppressWarnings()
}

# Split df into train/test sets using the stratified spatial index.
# Returns a list holding both subsets plus the raw training index.
prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  idx <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  list(train = df[idx, ],
       test = df[-idx, ],
       index = idx)
}

# Get train/test indices
train.test <- prepare.data(df, .7)

# Split; Remove non-predictive variables
# (data.table `:=` removes the geometry column by reference)
df.train <- df[train.test$index,] 
df.train[, `:=` (geometry=NULL)]
df.test <- df[-train.test$index,]
df.test[, `:=` (geometry=NULL)]

# Predictor/response split for the training data
train.x <- df.train %>% select(-observations)
train.y <- df.train$observations

# Predictor/response split for the held-out test data
test.x <- df.test %>% select(-observations)
test.y <- df.test$observations
# Get model from cache if it has been saved before; otherwise persist it.
# NOTE: `model` is lazily evaluated, so on a cache hit the (expensive)
# model-fitting expression passed in is never executed.
#
# Args:
#   model: expression producing a fitted model (forced only on cache miss).
#   file.name: RDS file name for this model.
#   model.path: cache directory (created if absent).
# Returns: the cached or freshly fitted model object.
get.model <- function(model, file.name, model.path) {
  f.path <- file.path(model.path, file.name)
  if (!dir.exists(model.path)) {
    dir.create(model.path)
  }
  # Model cache check (file.exists() is already logical; the old
  # ifelse(..., T, F) wrapper was redundant)
  if (file.exists(f.path)) {
    model <- readRDS(f.path)
  } else {
    # Cache miss: force `model` and save it for future runs
    saveRDS(model, f.path)
  }
  model
}

LASSO Models - Importance from Coefficients

# All species (alphabetical) to iterate over when fitting per-species models
species <- sort(unique(df.train$common.name))
if (!dir.exists("artifacts/models")) dir.create("artifacts/models")

# Define the control for the train function: 5-fold cross-validation
ctrl <- trainControl(method = "cv", number = 5)

# Fit one LASSO (L1-penalized) regression per species x state, with all
# pairwise interactions, caching each fitted model on disk via get.model().
lasso.list <- purrr::map(species, function(spec) {
  spec.state.fit <- purrr::map(states, function(st) {
    cat("Fitting LASSO model for", spec, "in", st, "\n")
    # data.table filter + drop grouping columns by reference
    spec.df <- df.train[common.name == spec & state == st][
      , `:=` (state=NULL, common.name = NULL)]
    
    # Remove any columns where all values are the same (zero variance)
    .remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
    .remove <- .remove[.remove != "observations"]
    if (!is_empty(.remove)) {
      spec.df <- spec.df %>% select(-.remove)
    }
    
    # Identify fields to center/scale: numeric, non-binary columns only
    to.scale <- sapply(spec.df, function(x) is.numeric(x) && 
                         length(unique(x)) > 2)
    # Never scale the response. (Fixed: `to.scale$observations <- F` used
    # `$<-` on an atomic vector, which coerces it to a list with a warning;
    # indexed assignment keeps it a logical vector.)
    to.scale["observations"] <- FALSE
    to.scale <- names(spec.df) %in% names(which(to.scale))
    
    # Define the pre-processing steps (use the training data to avoid data leakage)
    # Apply centering and scaling to the non-binary fields and non-target
    preproc <- preProcess(spec.df[, ..to.scale], 
                          method = c("center", "scale"))
    
    # Center/Scale the training data
    df.train.scaled <- bind_cols(spec.df[, !(..to.scale)],
                                 predict(preproc, spec.df[, ..to.scale]))
    
    # LASSO (L1); Elastic Net, where alpha = 1
    fit.l1 <- get.model(
      train(observations ~ (.)^2, 
            data = df.train.scaled, 
            method = "glmnet",
            trControl = ctrl,
            tuneGrid = expand.grid(alpha = 1, 
                                   lambda = seq(0, 1, by = 0.1)),
            metric = "RMSE"),
      file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_regr_l1.rds"),
      model.path="artifacts/models/lasso_fs")
    
    gc()
    fit.l1
  })
  names(spec.state.fit) <- states
  spec.state.fit
})
## Fitting LASSO model for Belted Kingfisher in CO 
## Fitting LASSO model for Belted Kingfisher in NC 
## Fitting LASSO model for Belted Kingfisher in OR 
## Fitting LASSO model for Belted Kingfisher in VT 
## Fitting LASSO model for Cedar Waxwing in CO 
## Fitting LASSO model for Cedar Waxwing in NC 
## Fitting LASSO model for Cedar Waxwing in OR 
## Fitting LASSO model for Cedar Waxwing in VT 
## Fitting LASSO model for Downy Woodpecker in CO 
## Fitting LASSO model for Downy Woodpecker in NC 
## Fitting LASSO model for Downy Woodpecker in OR 
## Fitting LASSO model for Downy Woodpecker in VT 
## Fitting LASSO model for Ruddy Duck in CO 
## Fitting LASSO model for Ruddy Duck in NC 
## Fitting LASSO model for Ruddy Duck in OR 
## Fitting LASSO model for Ruddy Duck in VT 
## Fitting LASSO model for Sanderling in CO 
## Fitting LASSO model for Sanderling in NC 
## Fitting LASSO model for Sanderling in OR 
## Fitting LASSO model for Sanderling in VT 
## Fitting LASSO model for Sandhill Crane in CO 
## Fitting LASSO model for Sandhill Crane in NC 
## Fitting LASSO model for Sandhill Crane in OR 
## Fitting LASSO model for Sandhill Crane in VT 
## Fitting LASSO model for Sharp-shinned Hawk in CO 
## Fitting LASSO model for Sharp-shinned Hawk in NC 
## Fitting LASSO model for Sharp-shinned Hawk in OR 
## Fitting LASSO model for Sharp-shinned Hawk in VT 
## Fitting LASSO model for Wild Turkey in CO 
## Fitting LASSO model for Wild Turkey in NC 
## Fitting LASSO model for Wild Turkey in OR 
## Fitting LASSO model for Wild Turkey in VT
# Name the outer list by species for lookup below
names(lasso.list) <- species

# All species x state combinations to iterate over
spec.state <- expand.grid(common.name=species, state=states, stringsAsFactors=F)

# Variable importance per species x state from the LASSO coefficients:
# |coefficient| at the best cross-validated lambda, nonzero terms only.
lasso.imp <- purrr::map_df(seq_len(nrow(spec.state)), function(i) {
  spec <- spec.state[i,]$common.name
  st <- spec.state[i,]$state
  fit <- lasso.list[[spec]][[st]]
  # Coefficients at the selected lambda, as a one-column data frame
  coef.df <- coef(fit$finalModel, s = fit$bestTune$lambda) %>%
    as.matrix() %>%
    as.data.frame()
  # Remove the intercept
  coef.df <- coef.df[-1, , drop = FALSE]
  
  # Create a data frame of variable importance
  tibble(
    common.name = spec,
    state = st,
    variable = rownames(coef.df),
    importance = abs(coef.df[,1])
  ) %>%
    # Rank variables by importance
    arrange(state, common.name, -importance, variable) %>%
    # Only keep variables where imp. > 0 (LASSO zeroed the rest out)
    filter(importance > 0)
})

lasso.imp

Decision Tree Models - Variable Importance

# Fit one regression tree per species x state (anova method, cp = 0.001),
# caching each fitted model on disk via get.model().
tree.list <- purrr::map(species, function(spec) {
  spec.state.fit <- purrr::map(states, function(st) {
    cat("Fitting Decision Tree model for", spec, "in", st, "\n")
    # data.table filter + drop grouping columns by reference
    spec.df <- df.train[common.name == spec & state == st][
      , `:=` (state=NULL, common.name = NULL)]
    
    # Remove any columns where all values are the same (zero variance)
    .remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
    .remove <- .remove[.remove != "observations"]
    if (!is_empty(.remove)) {
      spec.df <- spec.df %>% select(-.remove)
    }

    # Fit the Decision Tree model (regression via method = "anova")
    fit.tree <- get.model(
      rpart::rpart(observations ~ ., data=spec.df, method="anova",
                   control=rpart::rpart.control(cp=0.001)),
      file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_tree.rds"),
      model.path="artifacts/models/trees_fs")
    
    gc()
    fit.tree
  })
  names(spec.state.fit) <- states
  spec.state.fit
})
## Fitting Decision Tree model for Belted Kingfisher in CO 
## Fitting Decision Tree model for Belted Kingfisher in NC 
## Fitting Decision Tree model for Belted Kingfisher in OR 
## Fitting Decision Tree model for Belted Kingfisher in VT 
## Fitting Decision Tree model for Cedar Waxwing in CO 
## Fitting Decision Tree model for Cedar Waxwing in NC 
## Fitting Decision Tree model for Cedar Waxwing in OR 
## Fitting Decision Tree model for Cedar Waxwing in VT 
## Fitting Decision Tree model for Downy Woodpecker in CO 
## Fitting Decision Tree model for Downy Woodpecker in NC 
## Fitting Decision Tree model for Downy Woodpecker in OR 
## Fitting Decision Tree model for Downy Woodpecker in VT 
## Fitting Decision Tree model for Ruddy Duck in CO 
## Fitting Decision Tree model for Ruddy Duck in NC 
## Fitting Decision Tree model for Ruddy Duck in OR 
## Fitting Decision Tree model for Ruddy Duck in VT 
## Fitting Decision Tree model for Sanderling in CO 
## Fitting Decision Tree model for Sanderling in NC 
## Fitting Decision Tree model for Sanderling in OR 
## Fitting Decision Tree model for Sanderling in VT 
## Fitting Decision Tree model for Sandhill Crane in CO 
## Fitting Decision Tree model for Sandhill Crane in NC 
## Fitting Decision Tree model for Sandhill Crane in OR 
## Fitting Decision Tree model for Sandhill Crane in VT 
## Fitting Decision Tree model for Sharp-shinned Hawk in CO 
## Fitting Decision Tree model for Sharp-shinned Hawk in NC 
## Fitting Decision Tree model for Sharp-shinned Hawk in OR 
## Fitting Decision Tree model for Sharp-shinned Hawk in VT 
## Fitting Decision Tree model for Wild Turkey in CO 
## Fitting Decision Tree model for Wild Turkey in NC 
## Fitting Decision Tree model for Wild Turkey in OR 
## Fitting Decision Tree model for Wild Turkey in VT
# Name the outer list by species for lookup below
names(tree.list) <- species

# Variable importance per species x state from the decision trees,
# min-max scaled to [0, 1] within each model.
tree.imp <- purrr::map_df(seq_len(nrow(spec.state)), function(i) {
  spec <- spec.state[i,]$common.name
  st <- spec.state[i,]$state
  fit <- tree.list[[spec]][[st]]
  vi <- fit$variable.importance
  # NOTE(review): a tree that never splits has NULL variable.importance and
  # would contribute an empty tibble here -- confirm all models split.
  # Create a data frame of variable importance
  tibble(
    common.name = spec,
    state = st,
    variable = names(vi),
    importance = (vi - min(vi)) / (max(vi) - min(vi))
  ) %>%
    # Rank variables by importance
    arrange(state, common.name, -importance, variable)
})

tree.imp